# - whichever performance metric you choose, you just have to explain why (why it is worth computing)
# - for clustering, PCA can be used, or just the most dominant features can be found and visualized on their own
# - to inspect the clusters, compute the mean, variance, etc. for each cluster
# - a box plot can be used to visualize the clustering
Global horizontal irradiance (GHI), the total amount of shortwave solar radiation reaching the Earth's surface, is a key parameter for solar power plants. This value, which includes both direct and diffuse solar radiation, is currently measured by geostationary satellites.
Predicting periods with a high GHI index allows solar power plants to take a proactive approach to optimizing their operation, in particular to plant maintenance, resource optimization, and workload distribution.
The dataset contains GHI measurements over the territory of Vietnam and is publicly available at https://data.openei.org/s3_viewer?bucket=nrel-pds-nsrdb&prefix=vietnam%2F. It was published by NREL (National Renewable Energy Laboratory) as part of the NSRDB (National Solar Radiation Database) and is available under the Creative Commons Attribution 3.0 United States License.
install.packages("BiocManager")
BiocManager::install("rhdf5")
install.packages("kableExtra")
install.packages("ggpubr")
install.packages("sparklyr")
install.packages("caret")
install.packages("webshot2")
library(rhdf5)
library(ggplot2)
library(dplyr)
library(magrittr)
library(lubridate)
library(kableExtra)
library(ggpubr)
library(sparklyr)
library(caret)
library(webshot2)
set.seed(10)
spark_install(version="3.3.2")
Sys.setenv(JAVA_HOME="/usr/lib/jvm/java-11-openjdk")
knitr::opts_knit$set(root.dir = "/mnt/StorageSpace/StorageSpace/repositories/ghi-predicting")
# Increase Spark's Java heap so the whole dataset fits in memory
conf <- spark_config()
conf$`sparklyr.shell.driver-memory` <- "16G"
conf$spark.memory.fraction <- 0.9
sc <- spark_connect(master = "local", version="3.3.2", config = conf)
Air temperature and wind speed can also be measured by satellite.
h5_file <- H5Fopen("./dataset/vietnam_2016.h5")
air_temp <- h5read(h5_file, "/air_temperature") # degrees Celsius
coordinates <- h5read(h5_file, "/coordinates") # (latitude, longitude) pairs in degrees
ghi <- h5read(h5_file, "/ghi") # W/m^2
meta <- h5read(h5_file, "/meta") # station metadata, incl. elevation in m
time_index <- h5read(h5_file, "/time_index") # measurement timestamps
wind_speed <- h5read(h5_file, "/wind_speed") # m/s
kable_out <- knitr::kable(h5ls(h5_file) %>%
                            select(name, dclass, dim),
                          format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = "bordered", full_width = FALSE, font_size = 16)
save_kable(kable_out, file = "tables/hdf5_structure.png")
We conclude that there are 75361 rows because 75361 coordinates are observed, and each row contains as many columns as there were measurements at different points in time.
There are as many columns as there were hours in the observation period: 366 * 24 = 8784.
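As a quick sanity check of these dimensions (a minimal sketch using the arrays loaded above):
# one row per observed coordinate, one column per hour of the leap year 2016
stopifnot(nrow(ghi) == 75361)
stopifnot(ncol(ghi) == 366 * 24)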
Since we are interested in classifying irradiance during the summer period, we filter the data measured from June 21 (day 172) to September 22 (day 266).
summer_start <- 172 # day of year
summer_end <- 266
summer_hours <- (summer_start * 24):(summer_end * 24)
sunrise <- 5
sunset <- 19
# keep only the hours of each day that fall between sunrise and sunset
sunny_hours <- summer_hours[summer_hours %% 24 %in% (sunrise + 1):(sunset + 1)]
coordinate_step <- 10
# Rough size of the observed region (~5° x 5°):
# 1° of latitude ≈ 111 km; 1° of longitude ≈ 111 km * cos(latitude)
# Area ≈ (5 * 111 km) * (5 * 111 km * cos(16°)) ≈ 296,000 square kilometers
time_measures_per_coor <- length(sunny_hours)
coor_num <- nrow(air_temp)
# keep every 10th coordinate to reduce the data volume
selected_rows <- seq(1, coor_num, by = coordinate_step)
coor_num <- length(selected_rows)
Having fixed the data ranges (selected_rows, sunny_hours) for the daylight hours of the summer period, we extract the data from the h5 matrices and convert them into a tabular format suitable for further use.
id <- 1:(coor_num * time_measures_per_coor)
latitude <- rep(coordinates[1, selected_rows], each = time_measures_per_coor)
longitude <- rep(coordinates[2, selected_rows], each = time_measures_per_coor)
elevation <- rep(meta[selected_rows, "elevation"], each = time_measures_per_coor)
rm(coordinates)
time <- rep(time_index[sunny_hours], coor_num)
# transpose before flattening so that, as in the rep() calls above, the
# coordinate varies slowest and time varies fastest in the resulting vectors
air_temp <- c(t(air_temp[selected_rows, sunny_hours]))
wind_speed <- c(t(wind_speed[selected_rows, sunny_hours]))
ghi <- c(t(ghi[selected_rows, sunny_hours]))
df <- data.frame(id, time, latitude, longitude, elevation, air_temp, wind_speed, ghi)
df <- df %>%
  filter(ghi != 0) %>% # drop zero-irradiance readings that remain after the daylight filter
  mutate(time = ymd_hms(strptime(time, "%Y-%m-%d %H:%M:%S")))
# closing all HDF5 handles
h5closeAll()
# removing helper arrays
rm(id, time, latitude, longitude, elevation, air_temp, wind_speed, ghi, h5_file, meta)
# removing helper values
rm(coor_num, coordinate_step, selected_rows, summer_end, summer_hours, summer_start, sunny_hours, sunrise,
sunset, time_index, time_measures_per_coor)
From these plots we can conclude that elevation has no particular association with any of the remaining variables, while trends can be observed in the relationships among wind speed, air temperature, and GHI.
Classification is performed on GHI, whose value is classified as normal or high (NORMAL, HIGH); this is of interest to power companies for a proactive approach to plant maintenance, resource optimization, and workload distribution.
Although 600 W/m² is generally taken as the threshold for high solar irradiance, based on the previously drawn plots we instead choose the third-quartile value, to better fit our particular dataset for Vietnam.
threshold <- quantile(df$ghi, probs = 0.75)
threshold
df %<>%
mutate(ghi_c = factor(ifelse(ghi > threshold, "HIGH", "NORMAL")))
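A quick look at the resulting class balance (a minimal sketch; with a strict third-quartile threshold we expect roughly a 25/75 split of HIGH to NORMAL):
# distribution of the two target classes after thresholding
table(df$ghi_c)
prop.table(table(df$ghi_c))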
k_folds <- 5
# Shuffle the data
df_shuffled <- df[sample(nrow(df)), ]
df_shuffled %<>%
  mutate(id = 1:nrow(df)) # sequential ids make fold slicing easier
# Copy into Spark as a tbl_spark for further processing
df_spark <- copy_to(sc, df_shuffled, "df_spark", overwrite = TRUE)
# Spark ML functions do not support timestamps, so time is converted to numeric
df_spark %<>%
  mutate(time_numeric = as.numeric(time))
formula <- ghi_c ~ air_temp + wind_speed + longitude + latitude + time_numeric
# Splits the data into train/test sets for k-fold cross-validation:
# fold `fold_num` becomes the test set, the remaining rows form the training set
split_data <- function(data, fold_num, fold_size) {
  test_start <- (fold_num - 1) * fold_size + 1
  test_end <- test_start + fold_size - 1
  test_set <- data %>%
    filter(id >= test_start & id <= test_end)
  train_set <- data %>%
    filter(!(id >= test_start & id <= test_end))
  return(list(train_set = train_set, test_set = test_set))
}
# Computes classification performance metrics for a fitted model on a test set
calc_perf <- function(model, test_set){
  pred <- ml_predict(model, test_set)
  # caret::confusionMatrix expects predictions first, then the reference labels;
  # both are pulled from `pred` so the rows are guaranteed to be aligned
  confMat <- confusionMatrix(factor(pull(pred, "predicted_label"), levels = c("HIGH", "NORMAL")),
                             factor(pull(pred, "ghi_c"), levels = c("HIGH", "NORMAL")))
  accuracy <- confMat$overall[["Accuracy"]]
  sensitivity <- confMat$byClass[["Sensitivity"]]
  specificity <- confMat$byClass[["Specificity"]]
  return(list(accuracy = accuracy, sensitivity = sensitivity,
              specificity = specificity))
}
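For reference, and to motivate why these particular metrics are computed (see the notes at the top of the document), they reduce to simple ratios of confusion-matrix counts; HIGH is the positive class here, since caret takes the first factor level as positive:
# accuracy    = (TP + TN) / (TP + TN + FP + FN)   overall hit rate
# sensitivity = TP / (TP + FN)   share of truly HIGH periods that are detected,
#                                the case that matters for proactive planning
# specificity = TN / (TN + FP)   share of NORMAL periods correctly left alone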
# Values shared by all cross-validation runs
row_num <- count(df_spark) %>% collect()
row_num <- row_num[[1,1]]
fold_size <- row_num %/% k_folds
Tuned hyperparameters (one combination per scenario): the regularization parameter (reg_param) and the maximum number of iterations (max_iter).
start_time <- proc.time()
reg_parameter <- c(0, 0.001, 0.1)
max_iterations <- c(100, 500, 1000)
accuracies <- numeric(3)
sensitivities <- numeric(3)
specificities <- numeric(3)
for(scenario in 1:3){
  accuracy_sum <- 0
  sensitivity_sum <- 0
  specificity_sum <- 0
  for(i in 1:k_folds){
    partitioned_data <- split_data(df_spark, i, fold_size)
    logreg <- ml_logistic_regression(partitioned_data$train_set,
                                     formula,
                                     reg_param = reg_parameter[scenario],
                                     max_iter = max_iterations[scenario],
                                     family = "binomial")
    performance <- calc_perf(logreg, partitioned_data$test_set)
    accuracy_sum <- accuracy_sum + performance[["accuracy"]]
    sensitivity_sum <- sensitivity_sum + performance[["sensitivity"]]
    specificity_sum <- specificity_sum + performance[["specificity"]]
  }
  accuracies[scenario] <- accuracy_sum / k_folds
  sensitivities[scenario] <- sensitivity_sum / k_folds
  specificities[scenario] <- specificity_sum / k_folds
}
# Display the results
results_logreg <- data.frame(accuracies,
sensitivities,
specificities)
rownames(results_logreg) <- c("Scenario 1", "Scenario 2", "Scenario 3")
colnames(results_logreg) <- c("Accuracy", "Sensitivity", "Specificity")
kable_out <- knitr::kable(results_logreg,
                          label = "Logistic regression results",
                          format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = "bordered", full_width = FALSE, font_size = 16)
end_time <- proc.time()
elapsed <- end_time - start_time
print(paste("Elapsed time (min):", round(elapsed[3] / 60, 2)))
save_kable(kable_out, file = "tables/results_logreg.png")
Based on the results, we can conclude that the parameters in scenario 3 achieve the best performance.
Tuned hyperparameters (one combination per scenario): the regularization parameter (reg_param) and the maximum number of iterations (max_iter).
start_time <- proc.time()
#### Training and validation
reg_parameter <- c(0, 0.001, 0.1)
max_iterations <- c(100, 500, 1000)
accuracies <- numeric(3)
sensitivities <- numeric(3)
specificities <- numeric(3)
for(scenario in 1:3){
  accuracy_sum <- 0
  sensitivity_sum <- 0
  specificity_sum <- 0
  for(i in 1:k_folds){
    partitioned_data <- split_data(df_spark, i, fold_size)
    # linear support vector classifier
    lsvc <- ml_linear_svc(partitioned_data$train_set,
                          formula,
                          reg_param = reg_parameter[scenario],
                          max_iter = max_iterations[scenario])
    performance <- calc_perf(lsvc, partitioned_data$test_set)
    accuracy_sum <- accuracy_sum + performance[["accuracy"]]
    sensitivity_sum <- sensitivity_sum + performance[["sensitivity"]]
    specificity_sum <- specificity_sum + performance[["specificity"]]
  }
  accuracies[scenario] <- accuracy_sum / k_folds
  sensitivities[scenario] <- sensitivity_sum / k_folds
  specificities[scenario] <- specificity_sum / k_folds
}
# Display the results
results_lsvc <- data.frame(accuracies,
sensitivities,
specificities)
rownames(results_lsvc) <- c("Scenario 1", "Scenario 2", "Scenario 3")
colnames(results_lsvc) <- c("Accuracy", "Sensitivity", "Specificity")
kable_out <- knitr::kable(results_lsvc,
                          label = "Linear SVC results",
                          format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = "bordered", full_width = FALSE, font_size = 16)
end_time <- proc.time()
elapsed <- end_time - start_time
print(paste("Elapsed time (min):", round(elapsed[3] / 60, 2)))
save_kable(kable_out, file = "tables/results_lsvc.png")
Based on the results, we can conclude that the parameters in scenario 2 achieve the best performance.
start_time <- proc.time()
#### Training and validation
max_depth <- c(5,10,20)
min_instances_per_node <- c(2,5,10)
accuracies <- numeric(3)
sensitivities <- numeric(3)
specificities <- numeric(3)
for(scenario in 1:3){
  accuracy_sum <- 0
  sensitivity_sum <- 0
  specificity_sum <- 0
  for(i in 1:k_folds){
    partitioned_data <- split_data(df_spark, i, fold_size)
    dt <- ml_decision_tree_classifier(partitioned_data$train_set,
                                      formula,
                                      max_depth = max_depth[scenario],
                                      min_instances_per_node = min_instances_per_node[scenario])
    performance <- calc_perf(dt, partitioned_data$test_set)
    accuracy_sum <- accuracy_sum + performance[["accuracy"]]
    sensitivity_sum <- sensitivity_sum + performance[["sensitivity"]]
    specificity_sum <- specificity_sum + performance[["specificity"]]
  }
  accuracies[scenario] <- accuracy_sum / k_folds
  sensitivities[scenario] <- sensitivity_sum / k_folds
  specificities[scenario] <- specificity_sum / k_folds
}
# Display the results
results_dt <- data.frame(accuracies,
sensitivities,
specificities)
rownames(results_dt) <- c("Scenario 1", "Scenario 2", "Scenario 3")
colnames(results_dt) <- c("Accuracy", "Sensitivity", "Specificity")
kable_out <- knitr::kable(results_dt,
                          label = "Decision tree results",
                          format = "html") %>%
  kableExtra::kable_styling(bootstrap_options = "bordered", full_width = FALSE, font_size = 16)
end_time <- proc.time()
elapsed <- end_time - start_time
print(paste("Elapsed time (min):", round(elapsed[3] / 60, 2)))
save_kable(kable_out, file = "tables/results_dt.png")
Based on the results, we can conclude that the parameters in scenario 3 achieve the best performance.
After analyzing the results across all methods, we conclude that the best performance is achieved by scenario 3 of the decision tree method.
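A programmatic cross-check of this conclusion (a minimal sketch; it stacks the three result tables built above and takes accuracy as the deciding metric):
# combine the per-model result tables and find the single best row
all_results <- rbind(
  cbind(model = "logistic regression", results_logreg),
  cbind(model = "linear SVC", results_lsvc),
  cbind(model = "decision tree", results_dt)
)
all_results[which.max(all_results$Accuracy), ]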
# A separate Spark copy of the data whose features will be normalized for clustering
df_spark_normalized <- copy_to(sc, df_shuffled, "df_spark_normalized", overwrite = TRUE)
df_spark_normalized %<>%
mutate(time_numeric = as.numeric(time))
features <- c("air_temp", "wind_speed", "time_numeric", "longitude", "latitude")
means <- numeric(length(features))
stds <- numeric(length(features))
names(means) <- features
names(stds) <- features
# compute the mean and standard deviation of each feature on the Spark side
for(col in features){
summary_stats <- df_spark_normalized %>%
summarise(
mean_value = mean(!!sym(col), na.rm = TRUE),
std_value = sd(!!sym(col), na.rm = TRUE)
) %>%
collect()
means[col] <- summary_stats$mean_value
stds[col] <- summary_stats$std_value
}
# z-score normalization: centre each feature and scale it to unit variance
for (col in features) {
mean <- means[col]
std <- stds[col]
df_spark_normalized <- df_spark_normalized %>%
mutate(!!sym(col) := (!!sym(col) - mean) / std)
}
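The same standardization can also be expressed with Spark's built-in feature transformers. A minimal alternative sketch of the manual loop above, shown on the un-normalized columns for illustration (note that ft_standard_scaler operates on an assembled vector column rather than on individual columns):
# assemble the feature columns into one vector column, then scale it
df_scaled <- df_spark_normalized %>%
  ft_vector_assembler(input_cols = features, output_col = "features_vec") %>%
  ft_standard_scaler(input_col = "features_vec", output_col = "features_scaled",
                     with_mean = TRUE, with_std = TRUE)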
We draw an elbow plot to determine the most suitable k for clustering.
start_time <- proc.time()
formula_clustering <- ghi_c ~ air_temp + wind_speed + time_numeric + longitude + latitude
sum_of_squared_errors <- numeric(15)
for(k in 2:15){
  clustered <- ml_bisecting_kmeans(df_spark_normalized, formula = formula_clustering, k = k)
  sum_of_squared_errors[k] <- clustered$cost
}
# Display: drop the k = 1 slot, which the loop never fills
k_errors_df <- data.frame(2:15, sum_of_squared_errors[2:15])
colnames(k_errors_df) <- c("K", "sum_of_squared_errors")
k_elbow_plot <- ggplot(k_errors_df) +
  geom_line(mapping = aes(x = K, y = sum_of_squared_errors)) +
  labs(title = "Sum of squared errors over number of clusters",
       x = "Number of clusters",
       y = "Sum of squared errors") +
  theme_bw() +
  scale_x_continuous(breaks = seq(2, 15, by = 1))
k_elbow_plot
end_time <- proc.time()
elapsed <- end_time - start_time
print(paste("Elapsed time (min):", round(elapsed[3] / 60, 2)))
Clustering is performed under two scenarios: k = 6 and k = 12.
Bisecting k-means was chosen because it scales well to large datasets and builds clusters by recursive splitting, which in Spark is often faster than standard k-means.
cluster_k_6 <- ml_bisecting_kmeans(df_spark_normalized, formula = formula_clustering, k = 6)
cluster_k_12 <- ml_bisecting_kmeans(df_spark_normalized, formula = formula_clustering, k = 12)
cluster_k_6
cluster_k_12
predicted_clusters_6 <- ml_predict(cluster_k_6, df_spark_normalized)
predicted_clusters_6 %<>%
select(prediction) %>%
rename("cluster_6" = prediction)
predicted_clusters_12 <- ml_predict(cluster_k_12, df_spark_normalized)
predicted_clusters_12 %<>%
select(prediction) %>%
rename("cluster_12" = prediction)
# attach both cluster assignments to the main Spark table
df_spark <- sdf_bind_cols(df_spark, predicted_clusters_6, predicted_clusters_12)
df_spark
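Following the note at the top about inspecting clusters, per-cluster summary statistics give a first characterization of the six clusters (a minimal sketch; the aggregation runs on the Spark side before collecting):
cluster_summary <- df_spark %>%
  group_by(cluster_6) %>%
  summarise(
    n = n(),
    mean_ghi = mean(ghi, na.rm = TRUE),
    sd_ghi = sd(ghi, na.rm = TRUE),
    mean_air_temp = mean(air_temp, na.rm = TRUE),
    mean_wind_speed = mean(wind_speed, na.rm = TRUE)
  ) %>%
  collect()
cluster_summary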
# collect the needed columns locally; ggplot cannot plot a tbl_spark directly
plot_df <- df_spark %>%
  select(longitude, latitude, cluster_6) %>%
  collect()
long_lat_clust_6 <- ggplot(plot_df, aes(x = longitude, y = latitude, color = factor(cluster_6))) +
  geom_point() +
  labs(title = "Cluster assignments (k = 6) across the region",
       x = "Longitude",
       y = "Latitude",
       color = "Cluster")
long_lat_clust_6
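Per the note at the top about box-plot visualization, GHI per cluster can be examined with a box plot (a minimal sketch; the Spark table is sampled before collecting to keep the local data small):
box_df <- df_spark %>%
  select(cluster_6, ghi) %>%
  sdf_sample(fraction = 0.1, replacement = FALSE, seed = 10) %>%
  collect()
ggplot(box_df, aes(x = factor(cluster_6), y = ghi)) +
  geom_boxplot() +
  labs(x = "Cluster (k = 6)", y = "GHI (W/m^2)")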
spark_disconnect(sc)